knitr document van Steensel lab

Introduction

86 TFs were chosen to desing TF reporters for. In this analysis, I will analyse the motifs of the chosen TFs. The goal is to find out how unique the motifs are and how well they represent the motif landscape.

Setup

Libraries

Collect motifs

Import all motifs as PWMs

seq_logos <- list()
for(i in 1:nrow(all_motifs)) {
  
  x <- all_motifs$motif_id[i]
  file_x <- paste("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/PWMs/", x, sep = "")
  
  if (file.exists(file_x) == T) {
    
    if (str_contains(x, "txt") == TRUE) {
    pwm <- read.table(file_x, header = T) %>% dplyr::select(-Pos) %>% t()
    seq_logos[i] <- list(pwm)
    }
    
    if (str_contains(x, "pcm") == TRUE)  {
    pwm <- read.table(file_x, header = F) %>% column_to_rownames("V1")
    pwm <- as.matrix(pwm)
    seq_logos[i] <- list(pwm)
    }
    
    if (str_contains(x, "jaspar") == TRUE) {
    pwm <- read_jaspar(file_x)
    pwm <- pwm@motif
    # Transform to relative values
    pwm <- colPercents(pwm)/100
    pwm <- pwm[1:4,]
    seq_logos[i] <- list(pwm)
    }
  }
}

# Rename pwms
names(seq_logos) <- all_motifs$motif_id

# Remove motifs for which pwms were not available
seq_logos <- seq_logos[!sapply(seq_logos,is.null)]

# Convert to universalmotif format
seq_logos_motif <- list()
for (i in names(seq_logos)) {
  seq_logos_motif[[i]] <- universalmotif::convert_motifs(seq_logos[[i]])
  seq_logos_motif[[i]]@name <- i
}

saveRDS(seq_logos, "/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/seq_logos.rds")


# ## Alternative motif collection
# motifs <- convert_motifs(MotifDb)
# motifs <- filter_motifs(motifs,organism="Hsapiens")

Compute pairwise similarities

# Compute similarities using universalmotif function
motif_sim <- compare_motifs(seq_logos_motif, method = "PCC") ## Methods that work well: ALLR, ALLR_LL, PCC

Fig 1A: UMAP visualization of chosen motifs within motif clusters

Aim: Show diversity of chosen motifs.

### UMAP analysis
set.seed(35425) 
motif_cor <- motif_sim %>%
  replace(is.na(.), 0)

motifs_selected <- rbind(gen1_motifs, gen2_motifs) %>%
  mutate(motif_id = ifelse(tf == "SOX9", "M06107_1.94d.txt", motif_id)) %>%
  mutate(motif_id = ifelse(tf == "SOX2", "M06121_1.94d.txt", motif_id))
saveRDS(motifs_selected, "/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/motifs_selected.rds")

tf_umap <- umap(motif_cor) 
## as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead
colnames(tf_umap$layout) <- c("A","B") 
tf_umap_plot <- tf_umap$layout %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "motif_id") %>% 
  left_join(all_motifs) %>%
  left_join(tf_clusters) %>%
  #left_join(lambert_clusters) %>%
  mutate(selected = ifelse(tf %in% motifs_selected$tf, "Yes", "No")) %>%
  mutate(cluster2 = ifelse(cluster <= 10, cluster, ">10")) %>%
  mutate(family2 = ifelse(family %in% c("bHLH", "bZIP", "Ets", "Forkhead", "GATA", "Homeodomain", "Nuclear receptor",
                                        "T-box", "C2H2 ZF"), family, "other"))
## Joining, by = "motif_id"
## Joining, by = "motif_id"
#Plot UMAP
ggplot(,aes(x = A, y = B)) +
  stat_density2d(data = tf_umap_plot, geom = "polygon", aes(alpha = ..level..), bins = 5) +
  geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = 1 , color = "grey") + 
  geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 4 , color = "#E07858") + 
  theme_pubr() +
  scale_alpha(range = c(0.05,0.15)) +
  geom_text_repel(data = tf_umap_plot %>% filter(selected == "Yes"), aes(label = tf), force = 86) + 
  ylab("UMAP2") + 
  xlab("UMAP1")

ggplot(,aes(x = A, y = B)) +
  geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = .4 , color = "grey") + 
  geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 1, color = "#E07858") + 
  theme_pubr() +
  ylab("UMAP2") + 
  xlab("UMAP1")

x <- ggplot(,aes(x = A, y = B, label = tf)) +
  geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = .4 , color = "grey") + 
  geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 1, color = "#E07858") + 
  theme_pubr() +
  ylab("UMAP2") + 
  xlab("UMAP1")

ggplotly(x)
ggplot(,aes(x = A, y = B)) +
  stat_density2d(data = tf_umap_plot, geom = "polygon", aes(alpha = ..level..), bins = 5) +
  geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = 1 , aes(color = family2)) + 
  geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 4 , color = "#E07858") + 
  theme_pubr() +
  scale_alpha(range = c(0.05,0.15)) +
  geom_text_repel(data = tf_umap_plot %>% filter(selected == "Yes"), aes(label = tf), force = 86) + 
  ylab("UMAP2") + 
  xlab("UMAP1")

Conclusion: Motifs represent large diversity within the complete motif collection.

Fig S1: Similarity heatmap of chosen motifs

motif_similarity_df <- data.frame(motif_sim) %>%
  rownames_to_column("motif1") %>%
  pivot_longer(cols = -motif1, names_to = "motif2", values_to = "similarity") %>%
  mutate(similarity = as.numeric(similarity)) 

motif_similarity_df_selected <- motif_similarity_df %>%
  filter(motif1 %in% motifs_selected$motif_id & motif2 %in% motifs_selected$motif_id) %>%
  replace(is.na(.), 0)

motif_similarity_df_selected <- motif_similarity_df_selected %>%
  left_join(motifs_selected %>% dplyr::select("motif1" = motif_id, "tf1" = tf)) %>%
  left_join(motifs_selected %>% dplyr::select("motif2" = motif_id, "tf2" = tf)) %>%
  distinct(tf1, tf2, similarity)
## Joining, by = "motif1"
## Joining, by = "motif2"
motif_similarity_mat <- motif_similarity_df_selected %>%
  spread(key = "tf2", value = "similarity") %>%
  column_to_rownames("tf1")

ord <- hclust( dist(motif_similarity_mat, method = "euclidean"), method = "ward.D2" )$order

motif_similarity_df_selected <- motif_similarity_df_selected %>%
  mutate(tf1 = factor(tf1, levels = rownames(motif_similarity_mat)[ord]),
         tf2 = factor(tf2, levels = rownames(motif_similarity_mat)[ord]))

ggplot(motif_similarity_df_selected,
       aes(x = tf1, y = tf2, fill = similarity)) +
  geom_tile(size = .5) +
  coord_fixed() +
  theme_pubr(x.text.angle = 90, border = T) +
  scale_fill_gradient2(low = "#B7B7A4", mid = "white", high = "#E17B5C", midpoint = 0.5)

ggplot(motif_similarity_df_selected,
       aes(x = tf1, y = tf2, fill = similarity)) +
  geom_tile(size = .5) +
  coord_fixed() +
  theme_pubr(x.text.angle = 90, border = T) +
  scale_fill_gradient2(low = "white", high = "#E17B5C", midpoint = 0.4)

Conclusion: We have some TFs with overlapping motifs. However, those are mostly nuclear receptors that have specific stimulations.

Session Info

paste("Run time: ",format(Sys.time()-StartTime))
## [1] "Run time:  1.057671 mins"
getwd()
## [1] "/DATA/usr/m.trauernicht/projects/SuRE-TF/library_design"
date()
## [1] "Wed May 15 16:27:27 2024"
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] stats4    parallel  grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] umap_0.2.8.0                MotifDb_1.32.0             
##  [3] ggpubr_0.4.0                biomaRt_2.46.3             
##  [5] ENCODExplorer_2.16.0        LncFinder_1.1.5            
##  [7] RcmdrMisc_2.7-2             sandwich_3.0-1             
##  [9] car_3.0-12                  carData_3.0-5              
## [11] sjmisc_2.8.9                ggrepel_0.9.1              
## [13] ggbeeswarm_0.6.0            vwr_0.3.0                  
## [15] latticeExtra_0.6-29         lattice_0.20-41            
## [17] stringdist_0.9.8            data.table_1.14.2          
## [19] RColorBrewer_1.1-3          ggseqlogo_0.1              
## [21] tibble_3.1.6                pheatmap_1.0.12            
## [23] heatmaply_1.3.0             viridis_0.6.2              
## [25] viridisLite_0.4.0           plotly_4.10.0              
## [27] tidyr_1.2.0                 stringr_1.4.0              
## [29] readr_2.1.2                 dplyr_1.0.8                
## [31] magrittr_2.0.3              phylotools_0.2.2           
## [33] ape_5.6-2                   DNABarcodes_1.20.0         
## [35] Matrix_1.5-1                gtools_3.9.2               
## [37] SimRAD_0.96                 zlibbioc_1.36.0            
## [39] ShortRead_1.48.0            GenomicAlignments_1.26.0   
## [41] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [43] MatrixGenerics_1.2.1        matrixStats_0.62.0         
## [45] Rsamtools_2.6.0             GenomicRanges_1.42.0       
## [47] GenomeInfoDb_1.26.7         BiocParallel_1.24.1        
## [49] Biostrings_2.58.0           XVector_0.30.0             
## [51] IRanges_2.24.1              S4Vectors_0.28.1           
## [53] BiocGenerics_0.36.1         universalmotif_1.8.5       
## [55] seqLogo_1.56.0              seqinr_4.2-8               
## [57] ggplot2_3.4.0              
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                rtracklayer_1.50.0           
##   [3] ModelMetrics_1.2.2.2          bit64_4.0.5                  
##   [5] knitr_1.38                    DelayedArray_0.16.3          
##   [7] rpart_4.1-15                  hwriter_1.3.2.1              
##   [9] hardhat_0.2.0                 RCurl_1.98-1.6               
##  [11] generics_0.1.2                RSQLite_2.2.12               
##  [13] proxy_0.4-26                  future_1.25.0                
##  [15] bit_4.0.4                     tzdb_0.3.0                   
##  [17] webshot_0.5.3                 xml2_1.3.3                   
##  [19] lubridate_1.8.0               httpuv_1.6.5                 
##  [21] isoband_0.2.5                 assertthat_0.2.1             
##  [23] gower_1.0.0                   xfun_0.30                    
##  [25] hms_1.1.1                     jquerylib_0.1.4              
##  [27] evaluate_0.15                 promises_1.2.0.1             
##  [29] TSP_1.2-0                     fansi_1.0.3                  
##  [31] progress_1.2.2                dendextend_1.15.2            
##  [33] dbplyr_2.1.1                  readxl_1.4.0                 
##  [35] DBI_1.1.2                     htmlwidgets_1.5.4            
##  [37] purrr_0.3.4                   ellipsis_0.3.2               
##  [39] crosstalk_1.2.0               RSpectra_0.16-1              
##  [41] backports_1.4.1               insight_0.18.2               
##  [43] vctrs_0.5.1                   sjlabelled_1.2.0             
##  [45] abind_1.4-5                   caret_6.0-92                 
##  [47] cachem_1.0.6                  withr_2.5.0                  
##  [49] checkmate_2.1.0               prettyunits_1.1.1            
##  [51] cluster_2.1.1                 lazyeval_0.2.2               
##  [53] crayon_1.5.1                  labeling_0.4.2               
##  [55] recipes_0.2.0                 pkgconfig_2.0.3              
##  [57] nlme_3.1-152                  vipor_0.4.5                  
##  [59] seriation_1.3.5               nnet_7.3-17                  
##  [61] rlang_1.0.6                   globals_0.14.0               
##  [63] lifecycle_1.0.3               registry_0.5-1               
##  [65] BiocFileCache_1.14.0          AnnotationHub_2.22.1         
##  [67] cellranger_1.1.0              zoo_1.8-10                   
##  [69] base64enc_0.1-3               beeswarm_0.4.0               
##  [71] png_0.1-7                     bitops_1.0-7                 
##  [73] splitstackshape_1.4.8         pROC_1.18.0                  
##  [75] blob_1.2.3                    parallelly_1.31.1            
##  [77] jpeg_0.1-9                    rstatix_0.7.0                
##  [79] ggsignif_0.6.3                scales_1.2.0                 
##  [81] memoise_2.0.1                 plyr_1.8.7                   
##  [83] compiler_4.0.5                cli_3.4.1                    
##  [85] ade4_1.7-19                   listenv_0.8.0                
##  [87] htmlTable_2.4.0               Formula_1.2-4                
##  [89] MASS_7.3-53.1                 tidyselect_1.1.2             
##  [91] stringi_1.7.6                 forcats_0.5.1                
##  [93] highr_0.9                     yaml_2.3.5                   
##  [95] askpass_1.1                   sass_0.4.1                   
##  [97] tools_4.0.5                   future.apply_1.8.1           
##  [99] rstudioapi_0.13               BH_1.78.0-0                  
## [101] foreach_1.5.2                 foreign_0.8-81               
## [103] gridExtra_2.3                 prodlim_2019.11.13           
## [105] farver_2.1.0                  digest_0.6.29                
## [107] BiocManager_1.30.19           shiny_1.7.1                  
## [109] lava_1.6.10                   nortest_1.0-4                
## [111] Rcpp_1.0.8.3                  broom_0.8.0                  
## [113] BiocVersion_3.12.0            later_1.3.0                  
## [115] httr_1.4.2                    AnnotationDbi_1.52.0         
## [117] Rdpack_2.3                    colorspace_2.0-3             
## [119] reticulate_1.24               XML_3.99-0.9                 
## [121] splines_4.0.5                 xtable_1.8-4                 
## [123] jsonlite_1.8.0                timeDate_3043.102            
## [125] ipred_0.9-12                  R6_2.5.1                     
## [127] Hmisc_4.7-0                   pillar_1.7.0                 
## [129] htmltools_0.5.2               mime_0.12                    
## [131] glue_1.6.2                    fastmap_1.1.0                
## [133] class_7.3-18                  interactiveDisplayBase_1.28.0
## [135] codetools_0.2-18              utf8_1.2.2                   
## [137] bslib_0.3.1                   curl_4.3.2                   
## [139] openssl_2.0.0                 survival_3.2-10              
## [141] rmarkdown_2.13                munsell_0.5.0                
## [143] e1071_1.7-9                   GenomeInfoDbData_1.2.4       
## [145] iterators_1.0.14              haven_2.5.0                  
## [147] reshape2_1.4.4                gtable_0.3.0                 
## [149] rbibutils_2.2.8